diff --git a/.github/workflows/android-arm-cpu.yml b/.github/workflows/android-arm-cpu.yml index d7515d45a..15cf7a16d 100644 --- a/.github/workflows/android-arm-cpu.yml +++ b/.github/workflows/android-arm-cpu.yml @@ -13,19 +13,19 @@ jobs: outputs: CONDITION: ${{ steps.preflight.outputs.CONDITION }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Preflight id: preflight run: | - echo ::set-output name=CONDITION::0 - ./scripts/.ci/preflight.sh android || ret=$? && echo $ret && echo ::set-output name=CONDITION::$ret + echo "CONDITION=0" >> $GITHUB_OUTPUT + ./scripts/.ci/preflight.sh android || ret=$? && echo $ret && echo "CONDITION=$ret" >> $GITHUB_OUTPUT android: needs: [setup] if: ${{ needs.setup.outputs.CONDITION != '11' }} runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: configure run: sudo apt-get install attr - name: build diff --git a/.github/workflows/ios-cpu.yml b/.github/workflows/ios-cpu.yml index 3aeb4af4c..da1beae88 100644 --- a/.github/workflows/ios-cpu.yml +++ b/.github/workflows/ios-cpu.yml @@ -13,18 +13,18 @@ jobs: outputs: CONDITION: ${{ steps.preflight.outputs.CONDITION }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Preflight id: preflight run: | - echo ::set-output name=CONDITION::0 - ./scripts/.ci/preflight.sh ios || ret=$? && echo $ret && echo ::set-output name=CONDITION::$ret + echo "CONDITION=0" >> $GITHUB_OUTPUT + ./scripts/.ci/preflight.sh ios || ret=$? 
&& echo $ret && echo "CONDITION=$ret" >> $GITHUB_OUTPUT ios-iphone-os: needs: [setup] if: ${{ needs.setup.outputs.CONDITION != '11' }} runs-on: macos-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: build run: ./scripts/build_framework_ios.sh diff --git a/.github/workflows/linux-x86-cpu-gcc.yml b/.github/workflows/linux-x86-cpu-gcc.yml index 3e8a716d5..b1a5aba6a 100644 --- a/.github/workflows/linux-x86-cpu-gcc.yml +++ b/.github/workflows/linux-x86-cpu-gcc.yml @@ -9,22 +9,22 @@ on: jobs: setup: - runs-on: ubuntu-18.04 + runs-on: ubuntu-22.04 outputs: CONDITION: ${{ steps.preflight.outputs.CONDITION }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Preflight id: preflight run: | - echo ::set-output name=CONDITION::0 - ./scripts/.ci/preflight.sh x86 || ret=$? && echo $ret && echo ::set-output name=CONDITION::$ret + echo "CONDITION=0" >> $GITHUB_OUTPUT + ./scripts/.ci/preflight.sh x86 || ret=$? && echo $ret && echo "CONDITION=$ret" >> $GITHUB_OUTPUT linux-gcc: needs: [setup] if: ${{ needs.setup.outputs.CONDITION != '11' }} - runs-on: ubuntu-18.04 + runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: build run: ./scripts/build_x86_linux.sh diff --git a/.github/workflows/macos-x64-cpu.yml b/.github/workflows/macos-x64-cpu.yml index 1fa1e4924..252f995a8 100644 --- a/.github/workflows/macos-x64-cpu.yml +++ b/.github/workflows/macos-x64-cpu.yml @@ -13,19 +13,19 @@ jobs: outputs: CONDITION: ${{ steps.preflight.outputs.CONDITION }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Preflight id: preflight run: | - echo ::set-output name=CONDITION::0 - ./scripts/.ci/preflight.sh x86 || ret=$? && echo $ret && echo ::set-output name=CONDITION::$ret + echo "CONDITION=0" >> $GITHUB_OUTPUT + ./scripts/.ci/preflight.sh x86 || ret=$? 
&& echo $ret && echo "CONDITION=$ret" >> $GITHUB_OUTPUT macos-clang: needs: [setup] if: ${{ needs.setup.outputs.CONDITION != '11' }} runs-on: macos-11 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: protobuf run: brew install protobuf opencv3 - name: build diff --git a/.gitignore b/.gitignore index b37594314..b1998411b 100644 --- a/.gitignore +++ b/.gitignore @@ -494,6 +494,10 @@ model/ # opencl generated code opencl_program.cc +# cache +*.cache +*.cache~ + # opencl generated code opencl_program.cc platforms/mac/tnn.xcodeproj/project.xcworkspace/xcuserdata/darrenyao.xcuserdatad/UserInterfaceState.xcuserstate diff --git a/CMakeLists.txt b/CMakeLists.txt index e50f270da..5f62c19e1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,6 @@ cmake_minimum_required(VERSION 3.1) - +#set(CMAKE_CXX_STANDARD 14) +#set(CMAKE_CXX_STANDARD_REQUIRED ON) # https://cmake.org/cmake/help/latest/policy/CMP0068.html if(POLICY CMP0068) cmake_policy(SET CMP0068 NEW) @@ -15,9 +16,9 @@ project(TNN) ENABLE_LANGUAGE(ASM) set(TNN_MAJOR_VERSION 0) -set(TNN_MINOR_VERSION 3) -set(TNN_PATCH_VERSION 0) -set(TNN_BUILD_VERSION 0) +set(TNN_MINOR_VERSION 4) +set(TNN_PATCH_VERSION 2) +set(TNN_BUILD_VERSION 11) set(TNN_VERSION "${TNN_MAJOR_VERSION}.${TNN_MINOR_VERSION}.${TNN_PATCH_VERSION}.${TNN_BUILD_VERSION}") option(TNN_CPU_ENABLE "Enable Cpu" ON) @@ -27,14 +28,16 @@ option(TNN_ARM82_ENABLE "Enable Arm82" OFF) option(TNN_METAL_ENABLE "Enable Metal" OFF) option(TNN_OPENCL_ENABLE "Enable OpenCL" OFF) option(TNN_CUDA_ENABLE "Enable CUDA" OFF) -option(TNN_DSP_ENABLE "Enable DSP" OFF) +option(TNN_SNPE_ENABLE "Enable Qualcomm SNPE DSP" OFF) option(TNN_ATLAS_ENABLE "Enable Atlas" OFF) option(TNN_TENSORRT_ENABLE "Enable TensorRT" OFF) option(TNN_OPENVINO_ENABLE "Enable OPENVINO" OFF) option(TNN_APPLE_NPU_ENABLE "Enable NPU" OFF) option(TNN_HUAWEI_NPU_ENABLE "Enable NPU" OFF) option(TNN_RK_NPU_ENABLE "Enable RKNPU" OFF) -option(TNN_JETSON_NANO_ENABLE "Enable Jetson Nano" OFF) 
+option(TNN_TNNTORCH_ENABLE "Enable TNNTorch" OFF) +option(TNN_ZIXIAO_ENABLE "Enable ZIXIAO" OFF) +option(TNN_TORCHVISION_ENABLE "Enable TorchVision" OFF) option(TNN_SYMBOL_HIDE "Enable Hide Symbol Visibility" ON) option(TNN_OPENMP_ENABLE "Enable OpenMP" OFF) option(TNN_BUILD_SHARED "Build Shared Library" ON) @@ -52,12 +55,18 @@ option(TNN_ONNX2TNN_ENABLE "Enable ONNX2TNN Converter" OFF) option(TNN_TNN2MEM_ENABLE "Enable tnn2mem" OFF) option(TNN_BUILD_BENCHMARK_TEST_LIB_ENABLE "Enable Build Benchmark Test Lib" OFF) option(TNN_GLIBCXX_USE_CXX11_ABI_ENABLE "Enable Use CXX11 ABI" ON) +option(TNN_PYBIND_ENABLE "Enable Pybind" OFF) option(TNN_METAL_FLOAT32 "Enable Metal Float32" OFF) option(TNN_COREML_FLOAT32 "Enable Float32 CoreML Model" ON) option(TNN_DYNAMIC_RANGE_QUANTIZATION_ENABLE "Enable Dynamic Range Quantization" OFF) +option(TNN_PACK_TORCH_LIB "Enable Torch Lib Pack in release" ON) +option(TNN_CUDA_JETSON_ENABLE "Enable CUDA build for Nvidia Jetson Driving Chips like Orin, Thor etc." 
OFF) set(TNN_USE_GFLAGS OFF) +set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) +set(CMAKE_SKIP_BUILD_RPATH FALSE) + message(${CMAKE_SOURCE_DIR}) message(${CMAKE_CURRENT_SOURCE_DIR}) @@ -76,6 +85,10 @@ if(TNN_PROFILER_ENABLE) set(TNN_SYMBOL_HIDE OFF) endif() +if(TNN_TORCHVISION_ENABLE) + add_definitions(-DTNN_TORCHVISION) +endif() + if(TNN_BENCHMARK_MODE) add_definitions(-DGENERATE_RESOURCE) endif() @@ -131,12 +144,17 @@ if(TNN_UNIT_TEST_ENABLE) add_definitions(-DGENERATE_RESOURCE) endif() +if(TNN_MATCHER_TEST_ENABLE) + set(TNN_SYMBOL_HIDE OFF) +endif() + if(TNN_CONVERTER_ENABLE) set(TNN_ONNX2TNN_ENABLE ON) endif() if(TNN_CONVERTER_ENABLE OR TNN_ONNX2TNN_ENABLE) set(TNN_SYMBOL_HIDE OFF) + set(TNN_PYBIND_ENABLE ON) add_definitions(-DTNN_CONVERTER_RUNTIME) endif() @@ -220,7 +238,11 @@ if(UNIX) endif() endif() -set(CMAKE_CXX_STANDARD 11) +if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.0) + set(CMAKE_CXX_STANDARD 11) +else() + set(CMAKE_CXX_STANDARD 17) +endif() set(CMAKE_POSITION_INDEPENDENT_CODE ON) if(TNN_METAL_ENABLE OR TNN_APPLE_NPU_ENABLE) @@ -256,7 +278,7 @@ message(STATUS "\tArm82:\t${TNN_ARM82_ENABLE}") message(STATUS "\tMetal:\t${TNN_METAL_ENABLE}") message(STATUS "\tOpenCL:\t${TNN_OPENCL_ENABLE}") message(STATUS "\tCUDA:\t${TNN_CUDA_ENABLE}") -message(STATUS "\tDSP:\t${TNN_DSP_ENABLE}") +message(STATUS "\tSNPE:\t${TNN_SNPE_ENABLE}") message(STATUS "\tAtlas:\t${TNN_ATLAS_ENABLE}") message(STATUS "\tTensorRT:\t${TNN_TENSORRT_ENABLE}") message(STATUS "\tAppleNPU:\t${TNN_APPLE_NPU_ENABLE}") @@ -264,6 +286,8 @@ message(STATUS "\tHuaweiNPU:\t${TNN_HUAWEI_NPU_ENABLE}") message(STATUS "\tRKNPU:\t${TNN_RK_NPU_ENABLE}") message(STATUS "\tJetson Nano:\t${TNN_JETSON_NANO_ENABLE}") message(STATUS "\tOpenVINO:\t${TNN_OPENVINO_ENABLE}") +message(STATUS "\tTNNTorch:\t${TNN_TNNTORCH_ENABLE}") +message(STATUS "\tZIXIAO:\t${TNN_ZIXIAO_ENABLE}") message(STATUS "\tOpenMP:\t${TNN_OPENMP_ENABLE}") message(STATUS "\tTEST:\t${TNN_TEST_ENABLE}") 
message(STATUS "\t--Unit Test:\t${TNN_UNIT_TEST_ENABLE}") @@ -279,6 +303,7 @@ message(STATUS "\tTNN2MEM:\t${TNN_TNN2MEM_ENABLE}") message(STATUS "\tBENCHMARK Test Lib:\t${TNN_BUILD_BENCHMARK_TEST_LIB_ENABLE}") message(STATUS "\tDynamic Range Quantization:\t${TNN_DYNAMIC_RANGE_QUANTIZATION_ENABLE}") message(STATUS "\tSHARING_MEM_WITH_OPENGL:\t${SHARING_MEM_WITH_OPENGL}") +message(STATUS "\tCuda Build Jetson Chips:\t${TNN_CUDA_JETSON_ENABLE}") include_directories(include) include_directories(source) @@ -385,19 +410,58 @@ if(TNN_CUDA_ENABLE) set(TARGET_OBJECTS ${TARGET_OBJECTS} "$") endif() +if(TNN_SNPE_ENABLE) + if(ANDROID_ABI STREQUAL "armeabi-v7a") + # SNPE 2.11+ no longer support ARMv7 + message(STATUS "TNN SNPE not available on Android ARMv7") + else() + link_directories(third_party/snpe/lib/aarch64-android/) + add_subdirectory(source/tnn/device/snpe) + set(TARGET_OBJECTS ${TARGET_OBJECTS} "$") + endif() +endif() + if(TNN_HUAWEI_NPU_ENABLE) if(ANDROID_ABI STREQUAL "armeabi-v7a") link_directories( - third_party/huawei_npu/hiai_ddk_latest/armeabi-v7a/ + third_party/huawei_npu/hiai_ddk_latest/ddk/ai_ddk_lib/lib/ ) else() link_directories( - third_party/huawei_npu/hiai_ddk_latest/arm64-v8a/ + third_party/huawei_npu/hiai_ddk_latest/ddk/ai_ddk_lib/lib64/ ) endif() add_subdirectory(source/tnn/device/huawei_npu) set(TARGET_OBJECTS ${TARGET_OBJECTS} "$") endif() +if(TNN_ATLAS_ENABLE) + add_definitions(-DGET_NETWORK_ENABLE) + add_subdirectory(source/tnn/device/atlas) + set(TARGET_OBJECTS ${TARGET_OBJECTS} "$") + include_directories(${CMAKE_SOURCE_DIR}/source/tnn/device/atlas) + set(ASCEND_PATH $ENV{DDK_PATH}) + if (NOT DEFINED ENV{DDK_PATH}) + set(ASCEND_PATH "/usr/local/Ascend/ascend-toolkit/latest") + message(STATUS "set default ASCEND_PATH: ${ASCEND_PATH}") + else () + message(STATUS "env ASCEND_PATH: ${ASCEND_PATH}") + endif() + set(ACL_LIB_PATH $ENV{NPU_HOST_LIB}) + if (NOT DEFINED ENV{NPU_HOST_LIB}) + set(ACL_LIB_PATH "/usr/local/Ascend/ascend-toolkit/latest/lib64") 
+ message(STATUS "set default ACL_LIB_PATH: ${ACL_LIB_PATH}") + else () + message(STATUS "env ACL_LIB_PATH: ${ACL_LIB_PATH}") + endif() + # Header path + include_directories( + ${ASCEND_PATH}/acllib/include/ + ) + # add host lib path + link_directories( + ${ACL_LIB_PATH} + ) +endif() if(TNN_RK_NPU_ENABLE) if(CMAKE_SIZEOF_VOID_P EQUAL 8) @@ -413,9 +477,23 @@ if(TNN_RK_NPU_ENABLE) set(TARGET_OBJECTS ${TARGET_OBJECTS} "$") endif() +if(TNN_TNNTORCH_ENABLE) + add_subdirectory(source/tnn/network/torch) + set(TARGET_OBJECTS ${TARGET_OBJECTS} "$") +endif() + +if(TNN_ZIXIAO_ENABLE) + add_subdirectory(source/tnn/device/zixiao) + set(TARGET_OBJECTS ${TARGET_OBJECTS} "$") +endif() + if(TNN_BUILD_SHARED) add_library(TNN SHARED ${SRC} ${TARGET_OBJECTS}) set_target_properties(TNN PROPERTIES VERSION ${TNN_VERSION} SOVERSION ${TNN_MAJOR_VERSION}) + set_target_properties(TNN PROPERTIES LINK_FLAGS "-Wl,-rpath,$ORIGIN") + if (NOT TNN_PACK_TORCH_LIB) + set_target_properties(TNN PROPERTIES LINK_FLAGS "-Wl,-rpath,$ORIGIN -Wl,-rpath,$ORIGIN/../torch/lib") + endif() if(SHARING_MEM_WITH_OPENGL) if(SYSTEM.Windows) target_link_libraries(TNN opengl32) @@ -456,6 +534,19 @@ elseif(SYSTEM.Windows) include(platforms/windows/CMakeLists.txt) endif() +if(TNN_PYBIND_ENABLE) + set(CMAKE_CXX_STANDARD 17) + include_directories(third_party/pybind11/include) + add_subdirectory(third_party/pybind11) + file(GLOB_RECURSE TORCH_SRC "source/pytnn/*.cc") + add_library(_pytnn SHARED ${TORCH_SRC}) + target_link_libraries(_pytnn pybind11::module) + set_target_properties(_pytnn PROPERTIES PREFIX "${PYTHON_MODULE_PREFIX}" + SUFFIX "${PYTHON_MODULE_EXTENSION}") + set_target_properties(_pytnn PROPERTIES LINK_FLAGS "-Wl,-rpath,\$ORIGIN") + target_link_libraries(_pytnn TNN) +endif() + if (TNN_TEST_ENABLE OR TNN_CONVERTER_ENABLE OR TNN_MODEL_CHECK_ENABLE OR TNN_DYNAMIC_RANGE_QUANTIZATION_ENABLE) set(TNN_USE_GFLAGS ON) endif () @@ -499,6 +590,7 @@ endif() if(TNN_DYNAMIC_RANGE_QUANTIZATION_ENABLE) 
add_subdirectory(tools/dynamic_range_quantization) endif() +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DPYBIND11_COMPILER_TYPE=\\\"_gcc\\\" -DPYBIND11_STDLIB=\\\"_libstdcpp\\\" -DPYBIND11_BUILD_ABI=\\\"_cxxabi1011\\\"") if (MSVC) target_compile_options(TNN PUBLIC "/Zc:__cplusplus") diff --git a/doc/cn/user/api.md b/doc/cn/user/api.md index 80df1ae80..cf5f1a870 100644 --- a/doc/cn/user/api.md +++ b/doc/cn/user/api.md @@ -63,7 +63,7 @@ TNN_NS::Status error; auto net_instance = tnn.CreateInst(config, error); ``` -TNN网络构建需配置NetworkConfig,device_type可配置`DEVICE_ARM`, `DEVICE_OPENCL`, `DEVICE_METAL`, `DEVICE_X86`, `DEVICE_CUDA`, `DEVICE_HUAWEI_NPU`, `DEVICE_RK_NPU`等多种加速方式,通过CreateInst接口完成网络的构建。 +TNN网络构建需配置NetworkConfig,device_type可配置`DEVICE_ARM`, `DEVICE_OPENCL`, `DEVICE_METAL`, `DEVICE_X86`, `DEVICE_CUDA`, `DEVICE_HUAWEI_NPU`, `DEVICE_RK_NPU`,`DEVICE_ATLAS`等多种加速方式,通过CreateInst接口完成网络的构建。 ### 步骤3. 输入设定 @@ -143,7 +143,7 @@ struct PUBLIC ModelConfig { ModelConfig参数说明: -- `model_type`: TNN当前开源版本仅支持传入`MODEL_TYPE_TNN`, `MODEL_TYPE_NCNN`, `MODEL_TYPE_COREML` 模型格式。 +- `model_type`: TNN当前开源版本仅支持传入`MODEL_TYPE_TNN`, `MODEL_TYPE_NCNN`, `MODEL_TYPE_COREML`, `MODEL_TYPE_ATLAS`模型格式。 - `params`: TNN模型需传入proto文件内容以及model文件路径。NCNN模型需传入param文件内容以及bin文件路径, COREML模型需传入coreml 模型所在目录路径。 @@ -181,7 +181,7 @@ struct PUBLIC NetworkConfig { NetworkConfig参数说明: -- `device_type`: 默认为`DEVICE_ARM`。 当前已支持 `DEVICE_NAIVE`、`DEVICE_ARM`、`DEVICE_X86`、`DEVICE_OPENCL`、`DEVICE_METAL`、`DEVICE_CUDA`、`DEVICE_HUAWEI_NPU`、`DEVICE_RK_NPU`。 +- `device_type`: 默认为`DEVICE_ARM`。 当前已支持 `DEVICE_NAIVE`、`DEVICE_ARM`、`DEVICE_X86`、`DEVICE_OPENCL`、`DEVICE_METAL`、`DEVICE_CUDA`、`DEVICE_HUAWEI_NPU`、`DEVICE_RK_NPU`、`DEVICE_ATLAS`。 - `device_id`: 默认为0,多个设备支持通过`device_id`选择,当前仅`DEVICE_CUDA`需配置此参数指定gpu id。 - `data_format`: 默认为tnn自动选择blob数据排布方式进行加速,可通过此参数设定特定blob数据排布进行加速。 - `network_type`: 默认根据`device_type`自动选择网络类型,可指定构建网络类型。 @@ -556,3 +556,94 @@ struct PUBLIC MatConvertParam { ### 16. 
version.h 构建版本信息 + +# Python API说明 + +Python API 基于pybind 对 C++ Core 相关API进行了封装,所有定义类型均可通过`pytnn`包名引入。相关用法与C++ API基本相同,仅改变了c++传引用参数作为返回值的函数行为,在python对应接口中改为直接作为函数返回值返回。此外,Python API提供了简化的API接口。 + +## 一、模型加载 + +### 1. load + +```python +def load(model_path, config_dict = {}): +``` + +其中`model_path`传递模型路径,对于TNN这种模型结构与权重分开存储的模型,仅需传递tnnproto 文件路径,模型权重路径基于后缀名自动查找。`config_dict`支持字典传入,相关key说明如下: + +* `input_shapes`: 支持list以及dict两种形式传入,其中dict key 可指定输入name。shape可通过两种格式指定: + +```python +{ "input_shapes": [ {"min": [1,3,224,224], "max": [1,3,248,248]} ]} +{ "input_shapes": [ [1,3,224,224] ]} +``` +其中min, max可用来指定支持的最小,最大尺寸,固定尺寸仅需指定一个尺寸即可,尺寸支持tuple和list。 +对于多输入模型,不同输入尺寸可以采用不同的格式指定支持的输入尺寸。 + +```python +{ "input_shapes": [ [1,3,112,112], {"min": [1,3,224,224], "max": [1,3,248,248]} ] } +``` + +其中第一个输入为固定输入尺寸,第二个输入为可变尺寸。 +相同的输入,通过dict传入,key可用于指定输入name: + +```python +{ "input_shapes": { "data_0": [1,3,112,112], "data_1": {"min": [1,3,224,224], "max": [1,3,248,248]} } } +``` +其中`data_0` 为固定输入尺寸,`data_1` 为可变输入尺寸。 + +* `device_type`: 支持DeviceType枚举类型以及字符串传入。 + +```python +{"device_type": DEVICE_NAIVE} +{"device_type": "naive"} +``` +枚举类型同c++,支持 `DEVICE_CUDA`, `DEVICE_X86`, `DEVICE_ARM`, `DEVICE_NAIVE`等。 +字符串类型与枚举类型命名一一对应,如`CUDA`, `cuda`均表示DEVICE_CUDA,支持大小写。 +特别说明:不指定device_type,默认选择`DEVICE_CUDA`。 + +* `data_format`,`network_type`, `precision`, `share_memory_mode`, `data_format` 与 `device_type`类似,均支持枚举类型和字符串类型输入,枚举类型同c++, 字符串类型与枚举类型命名一一对应,支持大小写。 + +* `cache_path`, `library_path` 支持字符串类型传入,`enable_tune_kernel` 支持布尔类型传入。 + +### 2. 
`load_raw`, `load_raw_range` + +```python +def load_raw(model_path, network_config, input_shapes=None): +def load_raw_range(model_path, network_config, min_input_shapes, max_input_shapes): +``` +两接口为TNN对应接口CreateInst的简单封装,其中`model_path`传递模型路径;`network_config`为`NetworkConfig`类实例,与C++类相同;`input_shapes`,`min_input_shapes`以及`max_input_shapes` 对应相关输入尺寸设定,类型为字典,其中key为输入name,value对应输入尺寸list。 + +## 二、网络运行 + +模型加载完成后,会返回pytnn新定义的Module类实例, 其中Module类定义的一重要函数为forward。 + +```python +class Module: +... + def forward(self, *inputs, rtype="list"): +... +``` + +其中`inputs`为不定长参数,每个输入数据存储于`numpy.ndarray`中,排布为NC[D1-D4]。支持多个输入直接传入,也支持list, tuple,dict形式传入。 如一个两输入网络,输入name依次为`data_1`, `data_2`,可支持以下几种方式传入数据。 + +``` +input1=numpy.ones((1,3,224,224), np.float32, 'F') +input2=numpy.ones((1,3,224,224), np.float32, 'F') +# case1 +outputs=module.forward(input1, input2) +#case2 +outputs=module.forward((input1, input2)) +#case3 +outputs=module.forward([input1, input2]) +#case4 +outputs=module.forward({"data_1":input1, "data_2":input2}) +``` +输出`outputs`默认返回类型为list,每个输出存储于`numpy.ndarray`中,排布为NC[D1-D4]。 + +`rtype` 支持 `list` , `dict`, 指定输出返回类型为字典类型时,key为模型输出name,value对应输出数据,存储于`numpy.ndarray`中。 + + + + + diff --git a/doc/cn/user/compile.md b/doc/cn/user/compile.md index b38c6d96a..ec02dd976 100644 --- a/doc/cn/user/compile.md +++ b/doc/cn/user/compile.md @@ -227,7 +227,41 @@ cd /scripts ``` ./build_macos.sh ``` +## 九、ATLAS环境编译 +### 1. 
环境要求 +#### 依赖库 + - cmake(使用3.1及以上版本) + - 交叉编译需要安装编译工具链 + - ubuntu: aarch64: sudo apt-get install g++-aarch64-linux-gnu gcc-aarch64-linux-gnu + - other linux: 下载arm toolchain: https://developer.arm.com/tools-and-software/open-source-software/developer-tools/gnu-toolchain/gnu-a/downloads + - CANN环境依赖: + toolkit软件包: wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/6.0.0.alpha003/Ascend-cann-toolkit_6.0.0.alpha003_linux-aarch64.run + kernel包: wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/6.0.0.alpha003/Ascend-cann-kernels-310p_6.0.0.alpha003_linux.run + chmod a+x Ascend-cann-toolkit_6.0.0.alpha003_linux-aarch64.run && chmod a+x Ascend-cann-kernels-310p_6.0.0.alpha003_linux.run + ./Ascend-cann-toolkit_6.0.0.alpha003_linux-aarch64.run --install # 默认安装路径:/usr/local/Ascend/ascend-toolkit + ./Ascend-cann-kernels-310p_6.0.0.alpha003_linux.run --install + +### 2. 编译步骤 +1)切换到脚本目录 +``` +cd /scripts +``` +2)编辑`build_atlas.sh`修改配置选项 +``` + SHARED_LIB="ON" # ON表示编译动态库,OFF表示编译静态库 + ARM="ON" # ON表示编译带有Arm CPU版本的库 + OPENMP="ON" # ON表示打开OpenMP + #ARM64: + CC=aarch64-linux-gnu-gcc # 指定C编译器 + CXX=aarch64-linux-gnu-g++ # 指定C++编译器 + TARGET_ARCH=aarch64 # 指定指令架构 + +``` +3)执行编译脚本 +``` +./build_atlas.sh +``` ## 编译参数option说明 |Option|默认值|说明| @@ -239,7 +273,7 @@ cd /scripts |TNN_METAL_ENABLE| OFF | 代码source/device/metal编译开关,代码包含metal加速指令。| |TNN_OPENCL_ENABLE| OFF | 代码source/device/opencl编译开关,代码包含opencl加速指令。| |TNN_CUDA_ENABLE| OFF | 代码source/device/cuda编译开关,当前适配TensorRT实现,后续会迁入更多加速代码实现。| -|TNN_DSP_ENABLE| OFF | 代码source/device/dsp编译开关,当前适配snpe实现。| +|TNN_SNPE_ENABLE| OFF | 代码source/device/snpe编译开关,当前适配Qualcomm SNPE DSP实现。| |TNN_ATLAS_ENABLE| OFF | 代码source/device/atlas编译开关,当前适配华为atlas加速框架。| |TNN_HUAWEI_NPU_ENABLE| OFF | 代码source/device/huawei_npu编译开关,当前适配HiAI加速框架。| |TNN_RK_NPU_ENABLE| OFF | 代码source/device/rknpu编译开关,当前适配rknpu_ddk加速框架。| diff --git a/doc/cn/user/demo.md b/doc/cn/user/demo.md index cf30e04e4..28b4c31b1 100644 --- a/doc/cn/user/demo.md +++ 
b/doc/cn/user/demo.md @@ -108,8 +108,8 @@ c) 如果需要执行OCR demo,需要将tnn_sdk_sample.h中的宏HAS_OPENCV设 ### 运行环境要求 -1. Android Studio 3.5 或以上 -2. NDK version >= 18, <= 21 +1. Android Studio 3.5 或以上, Android Studio 2022.2.1 测试可运行 +2. NDK version >= 18, NDK 22和23在链接第三方动态库可能会出错,例如opencv,hiai,不建议使用。 ### 运行步骤 @@ -534,6 +534,48 @@ NDK 22和23在链接第三方动态库可能会出错,例如opencv,hiai, 文本识别 demo ./demo_cuda_ocrdetecor ``` +##### Atlas +* 环境要求 + - Cmake (>= 3.1) + - 交叉编译需要安装编译工具链 + - ubuntu: aarch64: sudo apt-get install g++-aarch64-linux-gnu gcc-aarch64-linux-gnu + - other linux: 下载 arm toolchain: https://developer.arm.com/tools-and-software/open-source-software/developer-tools/gnu-toolchain/gnu-a/downloads + - CANN环境依赖: + toolkit软件包: wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/6.0.0.alpha003/Ascend-cann-toolkit_6.0.0.alpha003_linux-aarch64.run + kernel包: wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/6.0.0.alpha003/Ascend-cann-kernels-310p_6.0.0.alpha003_linux.run + chmod a+x Ascend-cann-toolkit_6.0.0.alpha003_linux-aarch64.run && chmod a+x Ascend-cann-kernels-310p_6.0.0.alpha003_linux.run + ./Ascend-cann-toolkit_6.0.0.alpha003_linux-aarch64.run --install # 默认安装路径:/usr/local/Ascend/ascend-toolkit + ./Ascend-cann-kernels-310p_6.0.0.alpha003_linux.run --install +* 编译 + 进入 `examples/linux/atlas` 目录 + ``` + cd /examples/linux/atlas + ``` + 执行 `build_atlas.sh` + ``` + sh build_aarch64_linux.sh + ``` +* 执行 + 进入 `examples/linux/cross/build_atlas` 目录,当不使用任何参数执行demo文件时,会打印demo用法信息,以图形分类demo为例: + ``` + cd build_atlas + ./demo_atlas_imageclassify + >Parameter -m and -p should be set + >usage: + >./demo_arm_linux_imageclassify [-h] [-p] tnnproto [-m] ommodel [-i] + > -h, print a usage message. + > -p, (required) tnn proto file path + > -m, (required) om model file path + > -i, (required) input file path + > -l,